January 22, 2016

Getting NBA Data

Lots of code, so we'll just skim through this.

Getting data from stats.nba.com into R

df_from_url = function(url, ncol, number_columns) {
    json = fromJSON(file = url, method = "C")
    df = data.frame(matrix(unlist(json$resultSets[[1]][[3]]), 
        ncol = ncol, byrow = TRUE), stringsAsFactors = FALSE)
    colnames(df) = json$resultSets[[1]][[2]]
    df[, number_columns] = apply(df[, number_columns], 
        2, function(x) as.numeric(as.character(x)))
    return(df)
}

Some Setup

Years

years = sapply(2007:2015, function(year) sprintf("%4d-%02d", 
    year, (year + 1)%%100))

Team Ids

team_fmt = "http://stats.nba.com/stats/leaguedashteamstats?Conference=&DateFrom=&DateTo=&Division=&GameScope=&GameSegment=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision="
team_urls = sapply(years, function(year) sprintf(team_fmt, 
    year))
team_dfs = sapply(team_urls, function(url) df_from_url(url, 
    30, c(1, 3:29)))
team_ids = Reduce(union, team_dfs[1, ])

Player Data

columns = c(35, 32, 24, 27, 30)
stat_types = c("Base", "Advanced", "Misc", "Scoring", 
    "Usage")
player_fmt = "http://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=%s&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight="

players = NULL
for (year in years) {
  season_df = NULL
  for (i in 1:length(stat_types)) {
    stat_type = stat_types[i]
    c = columns[i]
    numeric_columns = c(1,3,5:(c-1))
    url = sprintf(player_fmt, stat_type, year)
    df = df_from_url(url, c, numeric_columns)
    if (is.null(season_df)) {
      season_df = df
    } else {
      season_df = merge(
        season_df, df, by=1, all.x=TRUE, suffixes=c('', sprintf('_%s', stat_type))
      )
    }
  }

  season_df$SEASON = factor(year)
  if (is.null(players)) {
    players = season_df
  } else {
    players = rbind(players, season_df)
  }
}

Lineup Data

stat_types = c("Base", "Advanced", "Four+Factors", 
    "Misc", "Scoring", "Opponent")
columns = c(31, 24, 18, 18, 25, 31)
lineup_fmt = "http://stats.nba.com/stats/leaguedashlineups?Conference=&DateFrom=&DateTo=&Division=&GameID=&GameSegment=&GroupQuantity=5&LastNGames=0&LeagueID=00&Location=&MeasureType=%s&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=Per100Plays&Period=0&PlusMinus=N&Rank=N&Season=%s&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&TeamID=%d&VsConference=&VsDivision="

lineups = NULL
for (year in years) {
  for (team in team_ids) {
    season_df = NULL
    for (i in 1:length(stat_types)) {
      stat_type = stat_types[i]
      c = columns[i]
      numeric_columns = c(4,6:c)
      url = sprintf(lineup_fmt, stat_type, year, team)
      df = df_from_url(url, c, numeric_columns)
      if (is.null(season_df)) {
        season_df = df
      } else {
        season_df = merge(
          season_df, df, by=2, all.x=TRUE, suffixes=c('', sprintf('_%s', stat_type))
          )
      }
    }

    season_df$SEASON = factor(year)
    if (is.null(lineups)) {
      lineups = season_df
    } else {
      lineups = rbind(lineups, season_df)
    }
  }
}

Player Cleanup

players = mutate(players, MIN_TOTAL = MIN_Usage, MIN_GAME = MIN_Advanced)
players = select(players, -X, -matches("CFID|CFPARAMS|_[A-Z][a-z]", 
    FALSE))

Lineup Cleanup

lineups = tbl_df(lineups)
lineups = mutate(lineups, MIN_TOTAL = MIN_Advanced)
lineups = select(lineups, -X, -matches("GROUP_SET|CFID|CFPARAMS|_[A-Z][a-z]", 
    FALSE))
lineups = Filter(function(x) !all(is.na(x)), lineups)

Identifying players

lineups$PLAYERS = t(sapply(lineups$GROUP_ID, function(x) {
    as.integer(unlist(strsplit(as.character(x), split = " - ")))
}))

Calculating Player Averages for a Lineup

season_col = grep("SEASON", colnames(lineups))
player_col = grep("PLAYERS", colnames(lineups))
numeric_player_columns = as.vector(which(sapply(players, 
    is.numeric)))
lineup_averages = data.frame(t(apply(lineups, 1, function(x) {
    srows = players$SEASON == x[season_col]
    prows = players$PLAYER_ID %in% as.numeric(x[player_col:player_col + 
        4])
    sapply(players[srows & prows, numeric_player_columns], 
        mean)
})))

Calculating Usage-Weighted Averages for a Lineup

usg_weighted = data.frame(t(apply(lineups, 1, function(x) {
    srows = players$SEASON == x[season_col]
    prows = players$PLAYER_ID %in% as.numeric(x[player_col:player_col + 
        4])
    stats = players[srows & prows, numeric_player_columns]
    tot_usg = sum(stats$USG_PCT_PCT)
    sapply(stats, function(y) sum(y * stats$USG_PCT_PCT)/tot_usg)
})))

Putting it all together

colnames(lineups) = paste(names(lineups), "lineup", 
    sep = ".")
colnames(lineup_averages) = paste(names(lineup_averages), 
    "player", sep = ".")
colnames(usg_weighted) = paste(names(usg_weighted), 
    "usage", sep = ".")
nba = merge(merge(lineups, lineup_averages, by = 0), 
    usg_weighted, by = 0)
nba$Row.names = NULL
nba$Row.names = NULL
dim(nba)
## [1] 66784   259

Net Rating

Data Issues?

Final Cleanup

mins_ok = nba$MIN_TOTAL.lineup > 4
ratio_ok = nba$MIN_TOTAL.lineup/nba$MIN_TOTAL.player <= 
    0.1
lineup_ok = nba$NET_RATING.lineup > -50 & nba$NET_RATING.lineup < 
    50
player_ok = nba$NET_RATING.player > -10 & nba$NET_RATING.player < 
    10
nba = nba[mins_ok & ratio_ok & lineup_ok & player_ok, 
    ]

I'm not sure any of this would stand up to statistical scrutiny, but it should be okay for drawing pretty pictures.

Now What?

GROUP_ID.lineup SEASON.lineup PTS.lineup PTS.player PTS.usage
2551 - 101122 - 101127 - 201177 - 2211 2008-09 106.1 18.3 4.5
201589 - 200768 - 201564 - 2545 - 2624 2008-09 90.3 10.6 16.7
201147 - 2744 - 201567 - 201196 - 2863 2009-10 97.6 12.4 14.9
201147 - 2744 - 2545 - 201196 - 2863 2009-10 85.8 12.4 14.9
201567 - 2545 - 2562 - 201196 - 2863 2009-10 113.4 12.4 17.3
201605 - 2562 - 200762 - 201196 - 2863 2009-10 89.2 12.4 17.1

  • .lineup indicates stats for a given lineup.
  • .player indicates the average (full-season)stats for the players in a lineup.
  • .usage indicates the usage-weighted (sull-season) stats for the players in a lineup.

Efficiency vs. Usage Tradeoff

Players

Lineups

Usage-Weighting

Eli Witus

Turnovers? Assists?

Rebounding

Rebound value

Offense vs. Defense

Diminishing Returns

3 Point Shooting

Historic Volume

Is it helping?

Is this just the effect of time?

More shots == more makes

We need more shooters!

What's going on?

More Shooting != More Shooting

Conclusion

  • I have a pretty cool data set.
  • I'm pretty good at manipulating data.
  • I need some help with the analysis.
  • Basketball is hard and I'm questioning some of my assumptions.